# Setup file locations to import
beer_csv_loc <- "./StartingDBs/Beers.csv"
breweries_csv_loc <- "./StartingDBs/Breweries.csv"
# Read in data
beer_data <- read.csv(beer_csv_loc, header = TRUE)
brewery_data <- read.csv(breweries_csv_loc, header = TRUE)
# Data provided by Celia
# Celia's contributions prefixed by cb_
cb_beer <- read.xlsx(file="./StartingDBs/Beers_Celia.xlsx", sheetIndex = 1,
header=TRUE, stringsAsFactors = FALSE)
cb_brewery <- read.xlsx(file="./StartingDBs/Breweries_Celia.xlsx", sheetIndex = 1,
header=TRUE, stringsAsFactors = FALSE)
Setting up the initial DB’s and reading them into a data frame.
# Create a seperate data frame to store state data
### Note: This data set counts the district of columbia as a state
num_of_breweries_by_state <- data.frame(table(brewery_data$State))
##### Celia Banks #####
#Begin prepare an interactive geo map of breweries by state
#set the map
leaflet() %>% addTiles()
#add popup for brewery information
brewery_map <- cb_brewery %>% mutate(popup_info=paste(Name,"<br/>",City,"<br>",State,"<br>"))
#tweak the map with info layout, set circles smaller to enhance visual
leaflet() %>% addTiles() %>% addCircleMarkers(data=brewery_map,
lat=~Latitude,
lng=~Longitude,
radius=~1,
popup=~popup_info)
#End prepare an interactive geo map of breweries by state#
#corresponding distribution plot of breweries by state
ggplot(brewery_map, aes(x=as.factor(State), color="blue")) +
geom_bar(color="blue") +
ggtitle("Distribution of Breweries by State") +
xlab("State") +
geom_text(stat='count', aes(label=after_stat(count)), vjust=-.25) +
theme(legend.position="none", axis.text.x = element_text(angle = 90)) +
theme(plot.title = element_text(hjust = 0.5))

# Create data frame to access state count data
num_of_brew_by_state <- data.frame(as.factor(brewery_map$State))
#corresponding distribution plot of breweries by region
ggplot(brewery_map, aes(x=as.factor(Region), color="green")) +
geom_bar(color="green") +
ggtitle("Distribution of Breweries by Region") +
xlab("Region") +
geom_text(stat='count', aes(label=after_stat(count)), vjust=-.25) +
theme(legend.position="none", axis.text.x = element_text(angle = 90)) +
theme(plot.title = element_text(hjust = 0.5))

##### END CB #####
The number of breweries per state are as follows: AK:7, AL:3, AR:2, AZ:11, CA:39, CO:47, CT:8, DC:1, DE:2, FL:15, GA:7, HI:4, IA:5, ID:5, IL:18, IN:22, KS:3, KY:4, LA:5, MA:23, MD:7, ME:9, MI:32, MN:12, MO:9, MS:2, MT:9, NC:19, ND:1, NE:5, NH:3, NJ:3, NM:4, NV:2, NY:15, OH:15, OK:6, OR:29, PA:25, RI:5, SC:4, SD:1, TN:3, TX:28, UT:4, VA:16, VT:10, WA:23, WI:20, WV:1, WY:4
#the last six observations to check the merged file
# Change column name of beer_data to match brewery_data to use as a primary key
colnames(beer_data)[5] <- "Brew_ID"
# Merge two data bases using merge
full_brew_data <- merge(brewery_data, beer_data, by="Brew_ID")
# Rename columns 2 and 5 that were changed during the merge
colnames(full_brew_data)[2] <- "Brewery"
colnames(full_brew_data)[5] <- "Beer_Name"
# Print first 6 and last 6 observations
head(full_brew_data, n=6)
## Brew_ID Brewery City State Beer_Name Beer_ID ABV IBU
## 1 1 NorthGate Brewing Minneapolis MN Pumpion 2689 0.060 38
## 2 1 NorthGate Brewing Minneapolis MN Stronghold 2688 0.060 25
## 3 1 NorthGate Brewing Minneapolis MN Parapet ESB 2687 0.056 47
## 4 1 NorthGate Brewing Minneapolis MN Get Together 2692 0.045 50
## 5 1 NorthGate Brewing Minneapolis MN Maggie's Leap 2691 0.049 26
## 6 1 NorthGate Brewing Minneapolis MN Wall's End 2690 0.048 19
## Style Ounces
## 1 Pumpkin Ale 16
## 2 American Porter 16
## 3 Extra Special / Strong Bitter (ESB) 16
## 4 American IPA 16
## 5 Milk / Sweet Stout 16
## 6 English Brown Ale 16
tail(full_brew_data, n=6)
## Brew_ID Brewery City State
## 2405 556 Ukiah Brewing Company Ukiah CA
## 2406 557 Butternuts Beer and Ale Garrattsville NY
## 2407 557 Butternuts Beer and Ale Garrattsville NY
## 2408 557 Butternuts Beer and Ale Garrattsville NY
## 2409 557 Butternuts Beer and Ale Garrattsville NY
## 2410 558 Sleeping Lady Brewing Company Anchorage AK
## Beer_Name Beer_ID ABV IBU Style Ounces
## 2405 Pilsner Ukiah 98 0.055 NA German Pilsener 12
## 2406 Porkslap Pale Ale 49 0.043 NA American Pale Ale (APA) 12
## 2407 Snapperhead IPA 51 0.068 NA American IPA 12
## 2408 Moo Thunder Stout 50 0.049 NA Milk / Sweet Stout 12
## 2409 Heinnieweisse Weissebier 52 0.049 NA Hefeweizen 12
## 2410 Urban Wilderness Pale Ale 30 0.049 NA English Pale Ale 12
# Create CSV Files for head and tail
head_output_csv <- "./output/head_beer_data.csv"
tail_output_csv <- "./output/tail_been_data.csv"
write.csv(head(full_brew_data, n=6), head_output_csv)
write.csv(tail(full_brew_data, n=6), tail_output_csv)
The first and last 6 observations were written to CSV files in the “output” file directory as well as printed in this file.
# Create two new tables where one has only ABV with no <NA>'s and the other
state_abv <- data.frame(State=full_brew_data$State, ABV=full_brew_data$ABV)
state_abv <- state_abv %>% drop_na(ABV)
# Do the same for IBU
state_ibu <- data.frame(State=full_brew_data$State, IBU=full_brew_data$IBU)
state_ibu <- state_ibu %>% drop_na(IBU)
Multiple Data sets were created throught to handle specific questions. For example; if the question asked for the comparison between ABV and IBU then any beer with an NA for either would be dropped. For a question where ABV or IBU was not needed, then all data would be retained.
##state. Plot a bar chart
# Find the average ABV for the state_abv table
avg_state_abv <- ddply(state_abv, .(State), function(x) median(x$ABV))
colnames(avg_state_abv)[2] <- "Median_ABV" # Rename the column
# Drop DC. It's not a state....yet
avg_state_abv <- avg_state_abv[-c(8),] # DC is equal to position 8 on the table
# Order the states by Decending ABV content
avg_state_abv <- avg_state_abv[order(avg_state_abv$Median_ABV, decreasing=TRUE),]
# Find Average IBU
avg_state_ibu <- ddply(state_ibu, .(State), function(x) median(x$IBU))
colnames(avg_state_ibu)[2] <- "Median_IBU" # Rename column
# Order the states by IBU Decending
avg_state_ibu <- avg_state_ibu[order(avg_state_ibu$Median_IBU, decreasing=TRUE),]
# Plot Average Alcohol by Volume per state
ggplot(avg_state_abv, aes(x=reorder(State, desc(Median_ABV)), y=Median_ABV, fill=State)) +
geom_col(show.legend = FALSE, width=.9, position="dodge") +
ggtitle("Median ABV per State") +
xlab("State") +
ylab("Median Alcohol by Volume")

# Plot Average International Bitterness Unit per State
ggplot(avg_state_ibu, aes(x=reorder(State, desc(Median_IBU)), y=Median_IBU, fill=State)) +
geom_col(show.legend = FALSE, width=.9, position="dodge") +
ggtitle("Median IBU per State") +
xlab("State") +
ylab("Median IBU")

##### Celia Code #####
merge_bb <- merge(cb_beer, cb_brewery, by.x="Brewery_id", by.y="Brew_ID")
#Drop latitude and longitude columns as no longer needed
drop <- c("Latitude","Longitude")
cb_merged_bb <- merge_bb[,!(names(merge_bb) %in% drop)]
#rename columns to avoid confusion
#cb_merged_bb <- cb_merged_bb %>% rename(Beer_Name = Name.x, Brewery_Name = Name.y)
### RAH - used a different method of renaming columns ###
colnames(cb_merged_bb)[2] <- "Beer_Name"
colnames(cb_merged_bb)[8] <- "Brewery_Name"
### End RAH ###
#get count of breweries in each state
brewery_count <- aggregate(Brewery_id ~ State, cb_merged_bb, sum)
aggregate(cb_merged_bb$Brewery_id, by=list(Category=cb_merged_bb$State), FUN=sum)
## Category x
## 1 AK 8363
## 2 AL 4208
## 3 AR 820
## 4 AZ 9815
## 5 CA 46773
## 6 CO 65472
## 7 CT 9222
## 8 DC 1824
## 9 DE 857
## 10 FL 15781
## 11 GA 4662
## 12 HI 8521
## 13 IA 11756
## 14 ID 8605
## 15 IL 15231
## 16 IN 12855
## 17 KS 2899
## 18 KY 1334
## 19 LA 4817
## 20 MA 21195
## 21 MD 4789
## 22 ME 8142
## 23 MI 15130
## 24 MN 8687
## 25 MO 10063
## 26 MS 1698
## 27 MT 16604
## 28 NC 20439
## 29 ND 1008
## 30 NE 8855
## 31 NH 1508
## 32 NJ 1892
## 33 NM 5103
## 34 NV 4653
## 35 NY 19400
## 36 OH 7974
## 37 OK 5483
## 38 OR 26890
## 39 PA 26078
## 40 RI 5802
## 41 SC 3108
## 42 SD 1491
## 43 TN 2073
## 44 TX 25427
## 45 UT 7289
## 46 VA 11011
## 47 VT 6459
## 48 WA 25471
## 49 WI 18638
## 50 WV 314
## 51 WY 4438
#impute the rows with zeroes
cb_merged_bb$ABV[is.na(cb_merged_bb$ABV)] <- 0
cb_merged_bb$IBU[is.na(cb_merged_bb$IBU)] <- 0
#compute median ABV and IBU for each state
cb_medianABV <- aggregate(ABV ~ State + Region, cb_merged_bb, median)
cb_medianIBU <- aggregate(IBU ~ State + Region, cb_merged_bb, median)
#merge the medians dataframes
cb_mergeABVIBU <- merge(cb_medianABV, cb_medianIBU, by="State")
drop <- c("Region.y")
cb_mergeABVIBU = merge_bb[,!(names(merge_bb) %in% drop)]
#Visualize ABV v IBU medians by state and by region
options(scipen = 999)
cb_mergeABVIBU <- as.data.frame(cb_mergeABVIBU)
#by state
ggplot(cb_mergeABVIBU, aes(ABV, IBU, color=State)) +
geom_point(show.legend = FALSE) +
facet_wrap(vars(State))
## Warning: Removed 1005 rows containing missing values (geom_point).

#by region
ggplot(cb_mergeABVIBU, aes(ABV, IBU, color=Region)) +
geom_point() +
facet_wrap(vars(Region))
## Warning: Removed 1005 rows containing missing values (geom_point).

##### END #####
Celia was able to create a ABV vs IBU for state and regions. This gave us additional data to make assumptions from.
# Get the State with the highest average ABV
state_max_abv <- full_brew_data[which.max(full_brew_data$ABV),] # CO
state_max_abv[c(4,7)]
## State ABV
## 384 CO 0.128
# Get the State with the highest average IBU
state_max_ibu <- full_brew_data[which.max(full_brew_data$IBU),] # OR
state_max_ibu[c(4,8)]
## State IBU
## 1857 OR 138
Colorado has the highest percentage of alcohal by volume while Oregon has the highest IBU. Keep in mind the question was asking for the max and not the max average.
#and other types of Ale. Use KNN.
# Create a data frame with only IPA's
#df1[grep("dog",df1$type),]
ipa_beers <- beer_data_no_na[grep("IPA",beer_data_no_na$Style),]
# Change the Sytle column to only have the value of IPA for easier model training
ipa_beers$Style <- "IPA"
ales_beer <- beer_data_no_na[grep("Ale",beer_data_no_na$Style),]
# Change the Sytle column to only have the value of Ale for easier model training
ales_beer$Style <- "Ale"
# Combine the two data sets
ipas_and_ales <- rbind(ipa_beers, ales_beer)
# plot the data to get an initial view of the data
ggplot(ipas_and_ales, aes(x=ABV, y=IBU, colour=Style)) + geom_point()

# Create empty train and test data frames
ipas_and_ales_train <- ipas_and_ales # train data needs to be a duplicate of orig
ipas_and_ales_test <- data.frame()
# Split 70/30 (Train/Test) #
# Find the 30% value
samples_to_take <- round(as.numeric(nrow(ipas_and_ales_train)*.3), digits=0)
for (row in 1:samples_to_take) {
tmp_row <- sample_n(ipas_and_ales_train, 1) # Take sample
# Add Sample to test data
ipas_and_ales_test <- ipas_and_ales_test %>% rbind(tmp_row)
# Remove Sample from train data
ipas_and_ales_train <- ipas_and_ales_train[!(ipas_and_ales_train$Beer_ID==tmp_row$Beer_ID),]
}
# Declare how many interations to run
k_iterations <- 20
# find an optimal K value for the KNN model
style_accuracy <- data.frame(accuracy=numeric(k_iterations), k=numeric(k_iterations))
for (iter in 1:k_iterations) {
# Note: [,c(3,4)] represent the ABV and IBU columns
style_class <- knn(ipas_and_ales_train[,c(3,4)],
ipas_and_ales_test[,c(3,4)],
ipas_and_ales_train$Style,
prob=TRUE,
k=iter)
table(ipas_and_ales_test$Style, style_class)
cm <- confusionMatrix(table(ipas_and_ales_test$Style, style_class))
style_accuracy$accuracy[iter] <- cm$overall[1]
style_accuracy$k[iter] <- iter
}
#plot(style_accuracy$k, style_accuracy$accuracy, type="l", xlab="k")
plot_ly(style_accuracy, x=style_accuracy$k, y=style_accuracy$accuracy,
type="scatter", mode="lines")
# On average this loop gives me an optimal k level of k=5,6,7
# Use 6 as the choice k
optimal_k <- 6
################################################################################
# START Multi-Class prediction confidence with Heatmap
# Use existing Training and Test Data Frames
# assign IBU column values as numeric to keep it the same as ABV
ipas_and_ales_train$IBU <- as.numeric(ipas_and_ales_train$IBU)
ipas_and_ales_test$IBU <- as.numeric(ipas_and_ales_test$IBU)
# classification models require the outcome to be of type factor()
ipas_and_ales_train$Style <- as.factor(ipas_and_ales_train$Style)
ipas_and_ales_test$Style <- as.factor(ipas_and_ales_test$Style)
# grid config
grid_mesh_size <- .02
grid_margin <- 1
# Create a mesh grid to run the model
grid_l_min <- min(ipas_and_ales$IBU) - grid_margin
grid_l_max <- max(ipas_and_ales$IBU) + grid_margin
grid_w_min <- min(ipas_and_ales$ABV) - grid_margin
grid_w_max <- max(ipas_and_ales$ABV) + grid_margin
grid_l_range <- seq(grid_l_min, grid_l_max, grid_mesh_size)
grid_w_range <- seq(grid_w_min, grid_w_max, grid_mesh_size)
beer_grid <- meshgrid(grid_l_range, grid_w_range)
grid_ll <- beer_grid$X
grid_ww <- beer_grid$Y
# Create (another) classifier and run on the grid
# Reuse the optimal k-value from ealier for the neighbors argument
beer_model <- nearest_neighbor(neighbors=optimal_k, weight_func="inv") %>%
set_engine("kknn") %>%
set_mode("classification") %>%
fit(formula=Style ~ ABV + IBU, data=ipas_and_ales_train)
grid_ll_1 <- matrix(grid_ll, length(grid_ll), 1)
grid_ww_1 <- matrix(grid_ww, length(grid_ww), 1)
grid_final <- data.frame(grid_ll_1, grid_ww_1)
colnames(grid_final) <- c("IBU", "ABV")
grid_pred <- beer_model %>% predict(grid_final, type='prob')
dim_value <- dim(grid_ll)
prob_ipa <- matrix(grid_pred$.pred_IPA, dim_value[1], dim_value[2])
prob_ale <- matrix(grid_pred$.pred_Ale, dim_value[1], dim_value[2])
# Get the classifier Confidence
ale_ipa_z <- array(c(prob_ipa, prob_ale), dim=c(dim_value[1], dim_value[2],3))
diff <- aaply(ale_ipa_z, c(1,2), max) - (aaply(ale_ipa_z, c(1,2), sum) -
aaply(ale_ipa_z, c(1,2), max))
# Overlay the heatmap of the confidence on the scatter plot of the example
ale_ipa_fig <- plot_ly()
ale_ipa_fig <- ale_ipa_fig %>%
add_trace(data=ipas_and_ales_test,
y= ~ABV,
x= ~IBU,
symbol= ~Style,
split= ~Style,
symbols= c('square-dot', 'circle-dot'),
type= 'scatter',
mode= 'markers',
marker= list(size=12,
line=list(width=1.5),
color='lightyellow')) %>%
layout(title="Prediction Confidence on Test Split")
ale_ipa_fig <- ale_ipa_fig %>% add_trace(x=grid_l_range, y=grid_w_range, z=diff, type='heatmap')
ale_ipa_fig
6. Comment on the Summary stats and distribution of the ABV variable
Most of the beer created fell inbetween 5.0% - 6.7% ABV with a mean of 5.9% and a median of 5.6%. There is also a noticeable right skewness to the distribution.
##7. Is there an apparent relationship between the bitterness of the beer and its alcoholic content?
According to the charts there is a obvious positive correlation between ABV and IBU. After further investigation on why this could be, we found many answers but not one that stood out. The most common answer was that the longeryou boil hops to increase the IBU, the alcohol (ABV) goes up to ballance the bitterness.